{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "672ddd5b-65d3-4c1d-99db-52175c7ac84c",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.document_loaders import WikipediaLoader\n",
    "\n",
    "# Load Wikipedia content for the query \"LangChain\" as a list of documents.\n",
    "loader = WikipediaLoader(query=\"LangChain\")\n",
    "documents = loader.load()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "8af5d1ca-a249-4016-8a21-3ef0511135f7",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "10"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Count how many documents are currently held in `documents`.\n",
    "len(documents)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "b1bc01e6-8e8b-4299-9268-fc72e96e8492",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'LangChain is a framework designed to simplify the creation of applications using large language models (LLMs). As a language model integration framework, LangChain\\'s use-cases largely overlap with those of language models in general, including document analysis and summarization, chatbots, and code analysis.\\n\\n\\n== Background ==\\nLangChain was launched in October 2022 as an open source project by Harrison Chase, while working at machine learning startup Robust Intelligence. The project quickly garnered popularity, with improvements from hundreds of contributors on GitHub, trending discussions on Twitter, lively activity on the project\\'s Discord server, many YouTube tutorials, and meetups in San Francisco and London. In April 2023, LangChain had incorporated and the new startup raised over $20 million in funding at a valuation of at least $200 million from venture firm Sequoia Capital, a week after announcing a $10 million seed investment from Benchmark.\\n\\n\\n== Integrations ==\\nAs of March 2023, LangChain included integrations with systems including Amazon, Google, and Microsoft Azure cloud storage; API wrappers for news, movie information, and weather; Bash for summarization, syntax and semantics checking, and execution of shell scripts; multiple web scraping subsystems and templates; few-shot learning prompt generation support; finding and summarizing \"todo\" tasks in code; Google Drive documents, spreadsheets, and presentations summarization, extraction, and creation; Google Search and Microsoft Bing web search; OpenAI, Anthropic, and Hugging Face language models; iFixit repair guides and wikis search and summarization; MapReduce for question answering, combining documents, and question generation; N-gram overlap scoring; PyPDF, pdfminer, fitz, and pymupdf for PDF file text extraction and manipulation; Python and JavaScript code generation, analysis, and debugging; Weaviate vector database to cache embedding and data objects; Redis cache 
database storage; Python RequestsWrapper and other methods for API requests; SQL and NoSQL databases including JSON support; Streamlit, including for logging; text mapping for k-nearest neighbors search; time zone conversion and calendar operations; tracing and recording stack symbols in threaded and asynchronous subprocess runs; and the Wolfram Alpha website and SDK. As of April 2023, it can read from more than 50 document types and data sources.\\n\\n\\n== Further reading ==\\nBriggs, James; Ingham, Francisco (2023). \"LangChain: Introduction and Getting Started\". LangChain AI Handbook. Pinecone. Retrieved 2023-04-18.\\nBuniatyan, Davit (2023). \"Ultimate Guide to LangChain & Deep Lake: Build ChatGPT to Answer Questions on Your Financial Data\". Activeloop.\\n\\n\\n== References ==\\n\\n\\n== External links ==\\n\\nOfficial website\\nDiscord server support hub'"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the raw text of the first document.\n",
    "documents[0].page_content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "id": "048e3211-0e07-4aa2-a14e-7ee3dff14c2f",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.document_loaders import WebBaseLoader\n",
    "\n",
    "# Load the LangChain document-loaders documentation page from the web.\n",
    "loader = WebBaseLoader(\n",
    "    web_path=\"https://python.langchain.com/docs/modules/data_connection/document_loaders/\"\n",
    ")\n",
    "data = loader.load()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "563d1c98-1e57-4cae-911a-236a9bf7750c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3 documents retrieved\n"
     ]
    }
   ],
   "source": [
    "from langchain.retrievers import PubMedRetriever\n",
    "\n",
    "# Query PubMed for \"COVID\" and report how many documents came back.\n",
    "# NOTE: this rebinds `documents`, replacing whatever an earlier cell stored there.\n",
    "retriever = PubMedRetriever()\n",
    "documents = retriever.get_relevant_documents(\"COVID\")\n",
    "print(f\"{len(documents)} documents retrieved\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "5fa664e1-a45b-4064-a357-582a607e53ba",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The COVID-19 pandemic highlights the need for a psychological support in systemic sclerosis patients.\n",
      "Host genetic polymorphisms involved in long-term symptoms of COVID-19.\n",
      "Association Between COVID-19 Vaccination and Mortality after Major Operations.\n"
     ]
    }
   ],
   "source": [
    "# Print the title stored in each document's metadata, one per line.\n",
    "for document in documents:\n",
    "    print(document.metadata[\"title\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "id": "7658c188-7ed0-4c2f-881a-437bf487a39c",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.embeddings import OpenAIEmbeddings\n",
    "from langchain.retrievers import KNNRetriever\n",
    "\n",
    "# Build a KNN retriever over a small word list, embedded with OpenAIEmbeddings.\n",
    "# NOTE: this rebinds `retriever`, replacing whatever an earlier cell stored there.\n",
    "words = [\"cat\", \"dog\", \"computer\", \"animal\"]\n",
    "retriever = KNNRetriever.from_texts(words, OpenAIEmbeddings())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "id": "f7ef85ff-c111-47e1-aa06-6c0c72034662",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Document(page_content='dog', metadata={}),\n",
       " Document(page_content='animal', metadata={}),\n",
       " Document(page_content='cat', metadata={}),\n",
       " Document(page_content='computer', metadata={})]"
      ]
     },
     "execution_count": 83,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Ask the retriever for the documents relevant to the query \"dog\".\n",
    "retriever.get_relevant_documents(\"dog\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "id": "d197fbfa-2575-414a-bf6f-032b62a36485",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(4, 1536)"
      ]
     },
     "execution_count": 86,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from langchain.embeddings.openai import OpenAIEmbeddings\n",
    "\n",
    "# Embed each word with OpenAIEmbeddings; `doc_vectors` holds one vector per word.\n",
    "words = [\"cat\", \"dog\", \"computer\", \"animal\"]\n",
    "embeddings = OpenAIEmbeddings()\n",
    "doc_vectors = embeddings.embed_documents(words)\n",
    "\n",
    "# Display (number of vectors, vector length) so that a fresh run reproduces\n",
    "# the shape recorded in this cell's output.\n",
    "(len(doc_vectors), len(doc_vectors[0]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "id": "9cb2c2e7-037b-4ac8-97e4-eb379554eb47",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from scipy.spatial.distance import pdist, squareform\n",
    "\n",
    "# Pairwise distances between the embedding vectors. `pdist` returns the\n",
    "# condensed form (default metric: Euclidean); `squareform` expands it into a\n",
    "# symmetric square matrix with zeros on the diagonal.\n",
    "X = np.array(doc_vectors)\n",
    "dists = squareform(pdist(X))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "id": "56930288-043b-48f8-8724-79b275e6530a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<style type=\"text/css\">\n",
       "#T_05807_row0_col0, #T_05807_row1_col1, #T_05807_row2_col2, #T_05807_row3_col3 {\n",
       "  background-color: #3b4cc0;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_05807_row0_col1 {\n",
       "  background-color: #d65244;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_05807_row0_col2 {\n",
       "  background-color: #bd1f2d;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_05807_row0_col3 {\n",
       "  background-color: #dc5d4a;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_05807_row1_col0 {\n",
       "  background-color: #d44e41;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_05807_row1_col2 {\n",
       "  background-color: #ba162b;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_05807_row1_col3 {\n",
       "  background-color: #ec7f63;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_05807_row2_col0, #T_05807_row2_col1, #T_05807_row2_col3, #T_05807_row3_col2 {\n",
       "  background-color: #b40426;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_05807_row3_col0 {\n",
       "  background-color: #d55042;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_05807_row3_col1 {\n",
       "  background-color: #e97a5f;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "</style>\n",
       "<table id=\"T_05807\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th class=\"blank level0\" >&nbsp;</th>\n",
       "      <th id=\"T_05807_level0_col0\" class=\"col_heading level0 col0\" >cat</th>\n",
       "      <th id=\"T_05807_level0_col1\" class=\"col_heading level0 col1\" >dog</th>\n",
       "      <th id=\"T_05807_level0_col2\" class=\"col_heading level0 col2\" >computer</th>\n",
       "      <th id=\"T_05807_level0_col3\" class=\"col_heading level0 col3\" >animal</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th id=\"T_05807_level0_row0\" class=\"row_heading level0 row0\" >cat</th>\n",
       "      <td id=\"T_05807_row0_col0\" class=\"data row0 col0\" >0.000000</td>\n",
       "      <td id=\"T_05807_row0_col1\" class=\"data row0 col1\" >0.522352</td>\n",
       "      <td id=\"T_05807_row0_col2\" class=\"data row0 col2\" >0.575285</td>\n",
       "      <td id=\"T_05807_row0_col3\" class=\"data row0 col3\" >0.521214</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_05807_level0_row1\" class=\"row_heading level0 row1\" >dog</th>\n",
       "      <td id=\"T_05807_row1_col0\" class=\"data row1 col0\" >0.522352</td>\n",
       "      <td id=\"T_05807_row1_col1\" class=\"data row1 col1\" >0.000000</td>\n",
       "      <td id=\"T_05807_row1_col2\" class=\"data row1 col2\" >0.581203</td>\n",
       "      <td id=\"T_05807_row1_col3\" class=\"data row1 col3\" >0.478794</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_05807_level0_row2\" class=\"row_heading level0 row2\" >computer</th>\n",
       "      <td id=\"T_05807_row2_col0\" class=\"data row2 col0\" >0.575285</td>\n",
       "      <td id=\"T_05807_row2_col1\" class=\"data row2 col1\" >0.581203</td>\n",
       "      <td id=\"T_05807_row2_col2\" class=\"data row2 col2\" >0.000000</td>\n",
       "      <td id=\"T_05807_row2_col3\" class=\"data row2 col3\" >0.591435</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_05807_level0_row3\" class=\"row_heading level0 row3\" >animal</th>\n",
       "      <td id=\"T_05807_row3_col0\" class=\"data row3 col0\" >0.521214</td>\n",
       "      <td id=\"T_05807_row3_col1\" class=\"data row3 col1\" >0.478794</td>\n",
       "      <td id=\"T_05807_row3_col2\" class=\"data row3 col2\" >0.591435</td>\n",
       "      <td id=\"T_05807_row3_col3\" class=\"data row3 col3\" >0.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n"
      ],
      "text/plain": [
       "<pandas.io.formats.style.Styler at 0x7f8e87562710>"
      ]
     },
     "execution_count": 89,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Label the distance matrix with the words on both axes, then shade each\n",
    "# cell by its value using the 'coolwarm' colormap.\n",
    "df = pd.DataFrame(dists, index=words, columns=words)\n",
    "df.style.background_gradient(cmap='coolwarm')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dd81f24d-d5b7-4694-b42e-7f59c1b7e3c6",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
